# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


df_loanData = pd.read_csv('prosperLoanData.csv')


df_loanData.head()


df_loanData['Term'].value_counts()

36    87778
60    24545
12     1614
Name: Term, dtype: int64


#Check for duplicated records
df_loanData[df_loanData['ListingNumber'].duplicated() & df_loanData['ListingKey'].duplicated()]['ListingNumber'].size

871


df_loanData.shape

(113937, 81)


# Make a copy of the original dataframe for cleaning purposes
df_copy = df_loanData.copy()


df_copy['ListingCreationDate'] = pd.to_datetime(df_copy['ListingCreationDate'])
df_copy['ClosedDate'] = pd.to_datetime(df_copy['ClosedDate'])


df_copy[['ListingCreationDate', 'ClosedDate']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113937 entries, 0 to 113936
Data columns (total 2 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   ListingCreationDate  113937 non-null  datetime64[ns]
 1   ClosedDate           55089 non-null   datetime64[ns]
dtypes: datetime64[ns](2)
memory usage: 1.7 MB


index = df_copy[df_copy['ListingNumber'].duplicated()].index
df_copy.drop(index, axis='rows', inplace=True)


df_copy[df_copy['ListingNumber'].duplicated()]


term_type = pd.CategoricalDtype(categories=[12, 36, 60], ordered=True)
df_copy['Term'] = df_copy['Term'].astype(term_type)


df_copy['Term'].info()

<class 'pandas.core.series.Series'>
Int64Index: 113066 entries, 0 to 113936
Series name: Term
Non-Null Count   Dtype   
--------------   -----   
113066 non-null  category
dtypes: category(1)
memory usage: 993.9 KB


df_copy['LoanStatus']= df_copy['LoanStatus'].astype("category")


df_copy['LoanStatus'].info()

<class 'pandas.core.series.Series'>
Int64Index: 113066 entries, 0 to 113936
Series name: LoanStatus
Non-Null Count   Dtype   
--------------   -----   
113066 non-null  category
dtypes: category(1)
memory usage: 994.1 KB


df_copy['ListingCategory (numeric)'] = df_copy['ListingCategory (numeric)'].astype('category')


df_copy['ListingCategory (numeric)'].info()

<class 'pandas.core.series.Series'>
Int64Index: 113066 entries, 0 to 113936
Series name: ListingCategory (numeric)
Non-Null Count   Dtype   
--------------   -----   
113066 non-null  category
dtypes: category(1)
memory usage: 994.5 KB


df_copy['ProsperRating (numeric)'].info()

<class 'pandas.core.series.Series'>
Int64Index: 113066 entries, 0 to 113936
Series name: ProsperRating (numeric)
Non-Null Count  Dtype   
--------------  -----   
83982 non-null  category
dtypes: category(1)
memory usage: 994.1 KB


def convert_prosperRating():
    """Convert the ProsperRating (numeric) column data type to ordered category data type"""
    prosper_type = pd.CategoricalDtype(categories=[1, 2, 3, 4, 5, 6, 7], ordered=True)
    df_copy['ProsperRating (numeric)'] = df_copy['ProsperRating (numeric)'].astype(prosper_type)


#Convert the ProsperRating (numeric) column data type to ordered category data type
convert_prosperRating()


df_copy['ProsperRating (numeric)'].info()

<class 'pandas.core.series.Series'>
Int64Index: 113066 entries, 0 to 113936
Series name: ProsperRating (numeric)
Non-Null Count  Dtype   
--------------  -----   
83982 non-null  category
dtypes: category(1)
memory usage: 994.1 KB


#Store cleaned data in csv file
df_copy.to_csv('clean_data.csv')


#plot a histogram to showcase the distribution of LoanOriginalAmount
bins = np.arange(0, df_copy['LoanOriginalAmount'].max()+10, 2000)
plt.hist(data=df_copy, x='LoanOriginalAmount', bins=bins);
plt.title('LoanOriginalAmount Histogram')
plt.xlabel('LoanOriginalAmount')
plt.ylabel('Frequency');


#plot a histogram to showcase the distribution of BorrowerRate
bins = np.arange(0, df_copy['BorrowerRate'].max()+0.5, 0.05)
plt.hist(data=df_copy, x='BorrowerRate');
plt.title('BorrowerRate Histogram')
plt.xlabel('BorrowerRate')
plt.ylabel('Frequency');


#plot a histogram to showcase the distribution of AmountDelinquent
plt.hist(data=df_copy, x='AmountDelinquent');
plt.title('AmountDelinquent Histogram')
plt.xlabel('AmountDelinquent')
plt.ylabel('Frequency');


print(len(df_copy))
len(df_copy[df_copy['AmountDelinquent']>100000])

113066

112


#plot a histogram to showcase the distribution of AmountDelinquent

bins = np.arange(0, df_copy['AmountDelinquent'].max()+5, 10000)
plt.hist(data=df_copy, x='AmountDelinquent',bins=bins)
plt.xlim(0, 100000)
plt.title('AmountDelinquent Histogram')
plt.xlabel('AmountDelinquent')
plt.ylabel('Frequency');


#plot a histogram to showcase the distribution of AvailableBankcardCredit
bins = np.arange(0, df_copy['AvailableBankcardCredit'].max()+0.5, 0.05)
plt.hist(data=df_copy, x='AvailableBankcardCredit')
plt.title('AvailableBankcardCredit Histogram')
plt.xlabel('AvailableBankcardCredit')
plt.ylabel('Frequency');


print(len(df_copy))
len(df_copy[df_copy['AvailableBankcardCredit']>200000])

113066

89


#plot a histogram to showcase the distribution of AvailableBankcardCredit
bins = np.arange(0, df_copy['AvailableBankcardCredit'].max(), 20000)
plt.hist(data=df_copy, x='AvailableBankcardCredit')
plt.xlim(0, 200000)
plt.title('AvailableBankcardCredit Histogram')
plt.xlabel('AvailableBankcardCredit')
plt.ylabel('Frequency');


#plot a histogram to showcase the distribution of CreditScoreRangeLower
bins = np.arange(0, df_copy['CreditScoreRangeLower'].max()+10, 2000)
plt.hist(data=df_copy, x='CreditScoreRangeLower');
plt.title('CreditScoreRangeLower Histogram')
plt.xlabel('CreditScoreRangeLower')
plt.ylabel('Frequency');


#plot a histogram to showcase the distribution of DebtToIncomeRatio
bins = np.arange(0, df_copy['DebtToIncomeRatio'].max()+10, 2000)
plt.hist(data=df_copy, x='DebtToIncomeRatio');
plt.title('DebtToIncomeRatio Histogram')
plt.xlabel('DebtToIncomeRatio')
plt.ylabel('Frequency');


#plot a histogram to showcase the distribution of DebtToIncomeRatio
bins = np.arange(0, df_copy['DebtToIncomeRatio'].max()+10, 2000)
plt.hist(data=df_copy, x='DebtToIncomeRatio')
plt.xlim(0, 2)
plt.title('DebtToIncomeRatio Histogram')
plt.xlabel('DebtToIncomeRatio')
plt.ylabel('Frequency');


#plot a pie chart to showcase the distribution of Term of the loan
sorted_counts = df_copy['Term'].value_counts()
plt.pie(sorted_counts, labels = sorted_counts.index, startangle = 90, counterclock = False);
plt.axis('square')
plt.title('A pie chart showcasing the distribution of the loan Term');


print(len(df_copy))
df_copy['Term'].value_counts()

113066

36    87224
60    24228
12     1614
Name: Term, dtype: int64


df_copy['IncomeRange'].unique()

array(['$25,000-49,999', '$50,000-74,999', 'Not displayed', '$100,000+',
       '$75,000-99,999', '$1-24,999', 'Not employed', '$0'], dtype=object)


#Convert the IncomeRange column data type to ordered category data type
income_type = pd.CategoricalDtype(categories=['$0', 'Not employed', '$1-24,999', '$25,000-49,999', '$50,000-74,999', '$75,000-99,999', '$100,000+', 'Not displayed'], ordered=True)
df_copy['IncomeRange'] = df_copy['IncomeRange'].astype(income_type)
#plot a bar plot to showcase the distribution of IncomeRange
base_color = sns.color_palette()[0]
sns.countplot(data=df_copy, x='IncomeRange', color=base_color)
plt.xticks(rotation=90)
plt.ylabel('Frequency')
plt.title('A bar plot of IncomeRange distribution');


#plot a bar plot to showcase the distribution of LoanStatus
sns.countplot(data=df_copy, x="LoanStatus", color=base_color)
plt.xticks(rotation=90)
plt.title('A bar plot of LoanStatus distribution');


df_copy[df_copy['LoanStatus'] != 'Current']['LoanStatus'].value_counts()

Completed                 38061
Chargedoff                11992
Defaulted                  5018
Past Due (1-15 days)        800
Past Due (31-60 days)       361
Past Due (61-90 days)       311
Past Due (91-120 days)      304
Past Due (16-30 days)       265
FinalPaymentInProgress      203
Past Due (>120 days)         16
Cancelled                     5
Name: LoanStatus, dtype: int64


#plot a bar plot to showcase the distribution of EmploymentStatus
sns.countplot(data=df_copy, x="EmploymentStatus", color=base_color)
plt.xticks(rotation=90)
plt.title('A bar plot of EmploymentStatus distribution');


#Convert the ProsperRating (numeric) column data type to ordered category data type
prosper_type = pd.CategoricalDtype(categories=[1, 2, 3, 4, 5, 6, 7], ordered=True)
df_copy['ProsperRating (numeric)'] = df_copy['ProsperRating (numeric)'].astype(prosper_type)
#plot a bar plot to showcase the distribution of ProsperRating (numeric)
sns.countplot(data=df_copy, x="ProsperRating (numeric)", color=base_color)
plt.title('A bar plot of ProsperRating (numeric) distribution');


#plot a scatter plot of LoanOriginalAmount against BorrowerRate
plt.figure(figsize=(5,5))
plt.scatter(data=df_copy, x='BorrowerRate', y='LoanOriginalAmount', alpha=1/10)
plt.xlabel('BorrowerRate')
plt.ylabel('LoanOriginalAmount')
plt.title('Scatter Plot of BorrowerRate against LoanOriginalAmount');


#plot a scatter plot of LoanOriginalAmount against CreditScoreRangeLower
plt.figure(figsize=(5,5))
plt.scatter(data=df_copy, x='CreditScoreRangeLower', y='LoanOriginalAmount', alpha=1/10)
plt.xlim(300, 900)
plt.xlabel('CreditScoreRangeLower')
plt.ylabel('LoanOriginalAmount')
plt.title('Scatter Plot of CreditScoreRangeLower against LoanOriginalAmount');


#plot a scatter plot of LoanOriginalAmount against DebtToIncomeRatio
#plt.figure(figsize=(5,5))
plt.scatter(data=df_copy, x='DebtToIncomeRatio', y='LoanOriginalAmount', alpha=1/10)
plt.xlabel('DebtToIncomeRatio')
plt.ylabel('LoanOriginalAmount')
plt.title('Scatter Plot of DebtToIncomeRatio against LoanOriginalAmount');


#plot a scatter plot of LoanOriginalAmount against AmountDelinquent
#plt.figure(figsize=(5,5))
plt.scatter(data=df_copy, x='AmountDelinquent', y='LoanOriginalAmount', alpha=1/10)
plt.xlabel('AmountDelinquent')
plt.ylabel('LoanOriginalAmount')
plt.title('Scatter Plot of AmountDelinquent against LoanOriginalAmount');


#plot a scatter plot of LoanOriginalAmount against AvailableBankcardCredit
#plt.figure(figsize=(5,5))
plt.scatter(data=df_copy, x='AvailableBankcardCredit', y='LoanOriginalAmount', alpha=1/10)
plt.xlabel('AvailableBankcardCredit')
plt.ylabel('LoanOriginalAmount')
plt.title('Scatter Plot of AvailableBankcardCredit against LoanOriginalAmount');


#plot a box plot depicting relationship between EmploymentStatus and LoanOriginalAmount
base_color = sns.color_palette()[0]
sns.boxplot(data=df_copy, x='EmploymentStatus', y='LoanOriginalAmount', color=base_color)
plt.xticks(rotation=15);
plt.xlabel('EmploymentStatus')
plt.ylabel('LoanOriginalAmount');
plt.title('Box plot depicting relationship between EmploymentStatus and LoanOriginalAmount');


#plot a Violin plot depicting relationship between IncomeRange and LoanOriginalAmount
base_color = sns.color_palette()[0]
sns.violinplot(data=df_copy, x='IncomeRange', y='LoanOriginalAmount', color=base_color, inner=None)
plt.xticks(rotation=15)
plt.xlabel('IncomeRange')
plt.ylabel('LoanOriginalAmount');
plt.title('Violin plot depicting relationship between IncomeRange and LoanOriginalAmount');


#plot a box plot depicting relationship between ProsperRating (numeric) and LoanOriginalAmount
base_color = sns.color_palette()[0]
sns.boxplot(data=df_copy, x='ProsperRating (numeric)', y='LoanOriginalAmount', color=base_color)
plt.xlabel('ProsperRating (numeric)')
plt.ylabel('LoanOriginalAmount');
plt.title('Box plot depicting relationship between ProsperRating (numeric) and LoanOriginalAmount');


#plot a box plot depicting relationship between ProsperRating (numeric) and BorrowerRate
base_color = sns.color_palette()[0]
sns.boxplot(data=df_copy, x='ProsperRating (numeric)', y='BorrowerRate', color=base_color)
plt.xlabel('ProsperRating (numeric)')
plt.ylabel('BorrowerRate')
plt.title('Box plot depicting relationship between ProsperRating (numeric) and BorrowerRate');


#plot a Violin plot depicting relationship between Term and LoanOriginalAmount
sns.violinplot(data=df_copy, x='Term', y='LoanOriginalAmount', color=base_color)
plt.title('Violin plot depicting relationship between Term and LoanOriginalAmount');


#plot scatter plots of BorrowerRate against LoanOriginalAmount and facet using the IncomeRange feature
g = sns.FacetGrid(data = df_copy, col = 'IncomeRange', height = 4, col_wrap=3)
g.map(plt.scatter, 'BorrowerRate', 'LoanOriginalAmount');


#plot a violinplot of Term against LoanOriginalAmount and facet using the LoanStatus feature
g = sns.FacetGrid(data = df_copy, col = 'ProsperRating (numeric)', height = 4, col_wrap=3, sharex=False)
g.map(sns.violinplot, 'Term', 'LoanOriginalAmount');

/home/egunza/anaconda3/lib/python3.9/site-packages/seaborn/axisgrid.py:670: UserWarning: Using the violinplot function without specifying `order` is likely to produce an incorrect plot.
  warnings.warn(warning)


#plot a violinplot of Term against LoanOriginalAmount and facet using the LoanStatus feature
g = sns.FacetGrid(data = df_copy, col = 'LoanStatus', height = 4, col_wrap=3, sharex=False)
g.map(sns.violinplot, 'Term', 'LoanOriginalAmount');

/home/egunza/anaconda3/lib/python3.9/site-packages/seaborn/axisgrid.py:670: UserWarning: Using the violinplot function without specifying `order` is likely to produce an incorrect plot.
  warnings.warn(warning)


#plot a scatter plot of BorrowerRate against LoanOriginalAmount and encode the 
#ProsperRating (numeric) using color.
groups = df_copy.groupby('ProsperRating (numeric)')
for name, group in groups:
    plt.plot(group.BorrowerRate, group.LoanOriginalAmount, marker='o', linestyle='', markersize=6, label=name)
plt.legend()

<matplotlib.legend.Legend at 0x7f157b9dd2b0>

	ListingKey	ListingNumber	ListingCreationDate	CreditGrade	Term	LoanStatus	ClosedDate	BorrowerAPR	BorrowerRate	LenderYield	...	LP_ServiceFees	PercentFunded	Investors
0	1021339766868145413AB3B	193129	2007-08-26 19:09:29.263000000	C	36	Completed	2009-08-14 00:00:00	0.16516	0.1580	0.1380	...	-133.18	1.0	258
1	10273602499503308B223C1	1209647	2014-02-27 08:28:07.900000000	NaN	36	Current	NaN	0.12016	0.0920	0.0820	...	0.00	1.0	1
2	0EE9337825851032864889A	81716	2007-01-05 15:00:47.090000000	HR	36	Completed	2009-12-17 00:00:00	0.28269	0.2750	0.2400	...	-24.20	1.0	41
3	0EF5356002482715299901A	658116	2012-10-22 11:02:35.010000000	NaN	36	Current	NaN	0.12528	0.0974	0.0874	...	-108.01	1.0	158
4	0F023589499656230C5E3E2	909464	2013-09-14 18:38:39.097000000	NaN	36	Current	NaN	0.24614	0.2085	0.1985	...	-60.27	1.0	20

Part I - (Exploration of Loan Data from Prosper)¶

by (Kevin Egunza)¶

Introduction¶

Preliminary Wrangling¶

Data Cleaning¶

Issue 1:¶

Define:¶

Code¶

Test¶

Issue 2:¶

Define:¶

Code¶

Test¶

Issue 3:¶

Define:¶

Code¶

Test¶

Issue 4:¶

Define:¶

Code¶

Test¶

Issue 5:¶

Define:¶

Code¶

Test¶

Issue 6:¶

Define:¶

Code¶

Test¶

The structure of the dataset¶

The main feature of interest in the dataset¶

Features in the dataset that will help support the investigation into the feature of interest¶

Univariate Exploration¶

What is the distribution of the LoanOriginalAmount feature?¶

What is the distribution of the BorrowerRate feature?¶

What is the distribution of the AmountDelinquent feature?¶

What is the distribution of the AvailableBankcardCredit feature?¶

What is the distribution of the CreditScoreRangeLower feature?¶

What is the distribution of the DebtToIncomeRatio feature?¶

What is the distribution of the Term feature?¶

What is the distribution of the IncomeRange feature?¶

What is the distribution of the LoanStatus feature?¶

What is the distribution of the EmploymentStatus feature?¶

What is the distribution of the ProsperRating (numeric) feature?¶

Discuss the distribution(s) of your variable(s) of interest. Were there any unusual points? Did you need to perform any transformations?¶

Of the features you investigated, were there any unusual distributions? Did you perform any operations on the data to tidy, adjust, or change the form of the data? If so, why did you do this?¶

Bivariate Exploration¶

How does the LoanOriginalAmount feature relate to BorrowerRate feature?¶

How does the LoanOriginalAmount feature relate to CreditScoreRangeLower feature?¶

How does the LoanOriginalAmount feature relate to DebtToIncomeRatio feature?¶

How does the LoanOriginalAmount feature relate to AmountDelinquent feature?¶

How does the LoanOriginalAmount feature relate to AvailableBankcardCredit feature?¶

How does the LoanOriginalAmount feature relate to EmploymentStatus feature?¶

How does the LoanOriginalAmount feature relate to IncomeRange feature?¶

How does the LoanOriginalAmount feature relate to ProsperRating (numeric) feature?¶

How does the ProsperRating (numeric) feature relate to BorrowerRate feature?¶

How does the LoanOriginalAmount feature relate to Term feature?¶

Talk about some of the relationships you observed in this part of the investigation. How did the feature(s) of interest vary with other features in the dataset?¶

Did you observe any interesting relationships between the other features (not the main feature(s) of interest)?¶

Multivariate Exploration¶

How do the IncomeRange, BorrowerRate, and LoanOriginalAmount features interact with each other?¶

How do the ProsperRating (numeric), BorrowerRate, and LoanOriginalAmount features interact with each other?¶

Talk about some of the relationships you observed in this part of the investigation. Were there features that strengthened each other in terms of looking at your feature(s) of interest?¶

Were there any interesting or surprising interactions between features?¶

Conclusions¶

How do the `IncomeRange`, `BorrowerRate`, and `LoanOriginalAmount` features interact with each other?¶

How do the `ProsperRating (numeric)`, `BorrowerRate`, and `LoanOriginalAmount` features interact with each other?¶